This R Markdown document reproduces the main analyses and figures for tumor evolution in high-clonality LUAD (Lung Adenocarcinoma) dataset from the Sherlock-Lung study
manuscript title: Deciphering lung adenocarcinoma evolution and the role of LINE-1 retrotransposition
Fig. 5: Activation of germline L1 retrotransposition due to DNA demethylation of L1 promoter.
(You may need to install some packages if missing)
library(tidyverse)
library(ggplot2)
library(ggsankey)
library(scales)
library(hrbrthemes) # For theme_ipsum_rc
library(cowplot)
library(forcats)
library(ggrepel)
library(ggnewscale)
library(data.table)
library(ggasym)
library(ggpmisc)
library(broom)
library(ggsci)
library(ggpubr)
library(hablar)
library(conflicted)
library(valr)
conflicts_prefer(dplyr::filter)
conflicts_prefer(dplyr::rename)
conflicts_prefer(ggplot2::annotate)
# --- Define Helper Functions (if any custom) -----------------------------------
# Please source your custom functions here if needed, or place them in ./functions/
# source('./functions/your_custom_functions.R')
# --- Load Input Datasets ------------------------------------------------------
# All data files should be placed in the appropriate subfolders.
# Example folder structure:
# ./data/ - for main input files
# ./functions/ - for custom R functions
# ./output/ - for saving plots and results
# Replace the filenames below with your actual dataset filenames.
# load Sherlock-lung data -------------------------------------------------
load('./data/BBsolution_final3_short.RData')
# load Sherlock-lung data -------------------------------------------------
load('./data/BBsolution_final3_short.RData',verbose = T)
## Loading objects:
## BBsolution
## BBsolution4
## hq_samples
## wgs_groups_info
## wgs_groups_info2
load('./data/covdata0.RData',verbose = T)
## Loading objects:
## covdata0
load('./data/clinical_data.RData',verbose = T)
## Loading objects:
## clinical_data
load('./data/sherlock_data_all.RData',verbose = T)
## Loading objects:
## sherlock_data
## sherlock_data_full
## sherlock_overall
## sherlock_type_colors
## sherlock_freq
## sherlock_freq_hq
load('./data/sherlock_variable.RData',verbose = T)
## Loading objects:
## sherlock_variable
load('./data/sp_group_data.RData',verbose = T)
## Loading objects:
## sp_group_color
## sp_group_color_new
## sp_group_data
## sp_group_data2
load('./data/id2data.RData',verbose = T)
## Loading objects:
## id2data
## id2color
load('./data/tedata.RData',verbose = T)
## Loading objects:
## tedata
load('./data/RNASeq_Exp.RData',verbose = T)
## Loading objects:
## cdata3
## rdata1
## adata
## analysis limited to luad only
hq_samples2 <- covdata0 %>% filter(Histology == 'Adenocarcinoma', Tumor_Barcode %in% hq_samples) %>% pull(Tumor_Barcode)
rm(list=c('hq_samples'))
Diagram illustrating the transposon mobilization mechanism for long interspersed element 1 (L1). Adapted from a publication by Levin and Moran58, this mechanism depicts non-LTR retrotransposons mobilizing through target-site-primed reverse transcription (TPRT). ORF2-encoded endonuclease generates a single-strand ‘nick’ in genomic DNA, freeing a 3′-OH used to prime RNA reverse transcription. Demethylated CpG in the L1 promoter region (top purple arrow) is hypothesized to activate L1 retrotransposition. Endonuclease activity, coupled with DNA repair mechanisms, might lead to one-base pair deletions or insertions at polymer A/T regions.
The polar plot represents the median methylation level across the genome locations of the CpG island on chr22q12.1, stratified by normal lung samples (N=80), tumor samples without ID2 signature (N=40), and tumors samples with ID2 signature (N=40). Control samples were designed to represent 0%, 33.3%, 66.6%, and 100% methylation levels at each CpG site as shown on the y-axis.
# load the data -----------------------------------------------------------
tmp <- readxl::read_xlsx('data/Sherlock data MC (Methylation levels){BE}.xlsx',sheet = 1,col_names = T,n_max = 1)
tmp <- tmp %>%
pivot_longer(cols = -c(`Region Chr22:`, `...2` )) %>%
select(Pos=name,CpG_ID=value)
tmp2 <- readxl::read_xlsx('data/Sherlock data MC (Methylation levels){BE}.xlsx',sheet = 1,col_names = T,skip = 1)
mdata <- tmp2 %>%
pivot_longer(cols = -c(`CpG n.`,`Tumor/Normal/Control` )) %>%
rename(Barcode=`CpG n.`, Type=`Tumor/Normal/Control`,CpG_ID=name,Beta=value) %>%
filter(str_detect(Barcode,'_')) %>%
left_join(tmp)
mdata <- mdata %>%
mutate(Sherlock_PID=str_remove(Barcode,'^[0-9]*_')) %>%
mutate(Sherlock_PID=str_remove(Sherlock_PID,'_[TN]$')) %>%
filter(!CpG_ID %in% c('Tum met cluster','Normal Tissue Met cluster')) %>%
mutate(Sherlock_PID = if_else(Sherlock_PID =='HOKO-153','HOKO-0153',Sherlock_PID)) %>%
mutate(Sherlock_PID = if_else(Sherlock_PID =='ITLU_0462','ITLU-0462',Sherlock_PID)) %>%
mutate(Sherlock_PID = if_else(Sherlock_PID =='TWAIN-0016','TWAN-0016',Sherlock_PID)) %>%
mutate(Sherlock_PID = if_else(Sherlock_PID =='TWAIN-0042','TWAN-0042',Sherlock_PID)) %>%
left_join(wgs_groups_info %>% select(Sherlock_PID,Subject)) %>%
mutate(Subject = if_else(Barcode=='64_100_PERCENT','100% methylated',Subject)) %>%
mutate(Subject = if_else(Barcode=='63_66.6_PERCENT','66.6% methylated_3',Subject)) %>%
mutate(Subject = if_else(Barcode=='62_66.6_PERCENT','66.6% methylated_2',Subject)) %>%
mutate(Subject = if_else(Barcode=='61_66.6_PERCENT','66.6% methylated_1',Subject)) %>%
mutate(Subject = if_else(Barcode=='60_33.3_PERCENT','33.3% methylated',Subject)) %>%
mutate(Subject = if_else(Barcode=='59_UNMETHYLATED','0% methylated',Subject)) %>%
mutate(Beta=as.numeric(Beta)) %>%
mutate(Pos=as.integer(Pos)) %>%
mutate(Type=if_else(Type=='T','Tumor',Type)) %>%
mutate(Type=if_else(Type=='N','Normal',Type)) %>%
mutate(Type=if_else(Type=='C','Control',Type))
mdata_control <- mdata %>% filter(Type=='Control')
tmp3 <- readxl::read_xlsx('./data/Sherlock data MC 2024-2023 (Methylation levels){BE}.xlsx',sheet = 2,col_names = T,skip = 2)
mdata <- tmp3 %>%
pivot_longer(cols = -c(`Year of analysis`,`Sample name`,`Tumor/Normal/Control` )) %>%
select(-`Year of analysis`) %>%
rename(Barcode=`Sample name`, Type=`Tumor/Normal/Control`,Pos=name,Beta=value)
mdata <- mdata %>%
mutate(Sherlock_PID=str_remove(Barcode,'^[0-9]*_')) %>%
mutate(Sherlock_PID=str_remove(Sherlock_PID,'_[TN]$')) %>%
mutate(Sherlock_PID=str_remove(Sherlock_PID,'[TN]$')) %>%
mutate(Sherlock_PID=str_replace(Sherlock_PID,'_','-')) %>%
#filter(!CpG_ID %in% c('Tum met cluster','Normal Tissue Met cluster')) %>%
mutate(Sherlock_PID = if_else(Sherlock_PID =='HOKO-153','HOKO-0153',Sherlock_PID)) %>%
mutate(Sherlock_PID = if_else(Sherlock_PID =='HOKO-004','HOKO-0004',Sherlock_PID)) %>%
mutate(Sherlock_PID = if_else(Sherlock_PID =='HOKO-009','HOKO-0009',Sherlock_PID)) %>%
mutate(Sherlock_PID = if_else(Sherlock_PID =='ITLU_0462','ITLU-0462',Sherlock_PID)) %>%
mutate(Sherlock_PID = if_else(Sherlock_PID =='TWAIN-0016','TWAN-0016',Sherlock_PID)) %>%
mutate(Sherlock_PID = if_else(Sherlock_PID =='TWAIN-0042','TWAN-0042',Sherlock_PID)) %>%
left_join(wgs_groups_info %>% select(Sherlock_PID,Subject)) %>%
# mutate(Subject = if_else(Barcode=='64_100_PERCENT','100% methylated',Subject)) %>%
# mutate(Subject = if_else(Barcode=='63_66.6_PERCENT','66.6% methylated_3',Subject)) %>%
# mutate(Subject = if_else(Barcode=='62_66.6_PERCENT','66.6% methylated_2',Subject)) %>%
# mutate(Subject = if_else(Barcode=='61_66.6_PERCENT','66.6% methylated_1',Subject)) %>%
# mutate(Subject = if_else(Barcode=='60_33.3_PERCENT','33.3% methylated',Subject)) %>%
# mutate(Subject = if_else(Barcode=='59_UNMETHYLATED','0% methylated',Subject)) %>%
mutate(Beta=as.numeric(Beta)) %>%
mutate(Pos=as.integer(Pos)) %>%
mutate(Type=if_else(Type=='T','Tumor',Type)) %>%
mutate(Type=if_else(Type=='N','Normal',Type)) %>%
mutate(Type=if_else(Type=='C','Control',Type))
mdata <- mdata %>% left_join(
mdata_control %>% select(CpG_ID,Pos) %>% unique()
)
mdata <- bind_rows(mdata_control,mdata)
#save(mdata,file='CGW_data.RData')
## new limitation to hqsamples for luad only
tmpdata <- wgs_groups_info %>%
left_join(sp_group_data2) %>%
left_join(id2data) %>%
select(Tumor_Barcode,Normal_Barcode,Subject,SP_Group_New,ID2,ID2_ratio,ID2_Present)
mdata <- mdata %>% left_join(tmpdata) %>% mutate(Sample=if_else(Type=='Tumor',Tumor_Barcode,if_else(Type=='Normal',Normal_Barcode,Subject)))
#
# ## Absent 6 and Present 23 have been selected before, no we need to select addtional samples for the sequecing
# tmp <- tmpdata %>% filter(Subject %in% hq_samples2, !(Subject %in% mdata$Subject))
#
# tmp %>% filter(ID2_Present=="Present") %>% arrange(desc(ID2)) %>% select(Tumor_Barcode,ID2,ID2_ratio,ID2_Present)->select1
# select1 <- select1 %>% left_join(wgs_groups_info) %>%
# filter(Study!='Public-WGS') %>%
# select(Subject,Subject_ID,Sherlock_PID,Tumor_Barcode,Tumor_Sample_ID,Normal_Barcode,Normal_Sample_ID,contains("ID2"))
#
# tmp %>% filter(ID2_Present=="Absent") %>% arrange((ID2)) %>% select(Tumor_Barcode,ID2,ID2_ratio,ID2_Present)->select2
# select2 <- select2 %>% left_join(wgs_groups_info) %>%
# filter(Study!='Public-WGS') %>%
# select(Subject,Subject_ID,Sherlock_PID,Tumor_Barcode,Tumor_Sample_ID,Normal_Barcode,Normal_Sample_ID,contains("ID2"))
#
# bind_rows(select1,select2) %>%
# write_csv('additional_selelection.csv',col_names = T)
# Analysis ----------------------------------------------------------------
# phase plot
# tmp <- mdata %>%
# filter(Type=='Normal') %>%
# select(Sample,Pos,Beta) %>%
# pivot_wider(names_from = Pos,values_from = Beta)
#
# tmpm <- as.matrix(tmp[,-1])
# rownames(tmpm) <- tmp$Sample
# tmpclust <- hclust(dist(tmpm))
# order1 <- rev(rownames(tmpm)[tmpclust$order])
tmp <- mdata %>%
filter(Type=='Tumor',ID2_Present=='Absent') %>%
select(Sample,Pos,Beta) %>%
pivot_wider(names_from = Pos,values_from = Beta)
tmpm <- as.matrix(tmp[,-1])
rownames(tmpm) <- tmp$Sample
tmpclust <- hclust(dist(tmpm))
order2 <- (rownames(tmpm)[tmpclust$order])
tmp <- mdata %>%
filter(Type=='Tumor',ID2_Present=='Present') %>%
select(Sample,Pos,Beta) %>%
pivot_wider(names_from = Pos,values_from = Beta)
tmpm <- as.matrix(tmp[,-1])
rownames(tmpm) <- tmp$Sample
tmpclust <- hclust(dist(tmpm))
order3 <- (rownames(tmpm)[tmpclust$order])
order1 <- tibble(Tumor_Barcode=c(order3,order2)) %>% left_join(wgs_groups_info) %>% pull(Normal_Barcode)
samleves <- c(rev(c('100% methylated','66.6% methylated_1','66.6% methylated_2','66.6% methylated_3','33.3% methylated','0% methylated')),order1,order3,order2)
mdata <- mdata %>% mutate(Sample=factor(Sample,levels=samleves)) %>% arrange(Sample)
mdata <- mdata %>%
mutate(ID2=if_else(Type=='Normal',NA_integer_,ID2)) %>%
mutate(ID2_Present=if_else(Type=='Normal',NA_character_,ID2_Present))
myggstyle()
mdata %>%
#filter(Type=='Tumor') %>%
ggplot(aes(Pos,Sample,fill=Beta))+
geom_point(pch=21,size=2)+
scale_fill_viridis_c(limits = c(0, 1))+
scale_x_continuous(breaks = sort(unique(mdata$Pos)),labels = sort(unique(mdata$Pos)),expand = c(0,0))+
facet_grid(Type~.,scales = 'free_y',space='free')+
theme_ipsum_rc(axis_title_just = 'm',ticks = T,axis = 'Y')+
theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),axis.text.y = element_text(size=7),panel.spacing = unit(0.5,'cm'),legend.position = 'top',legend.key.width = unit(2.5,'cm'), panel.grid.minor.x = element_blank())+
labs(x='',y='',fill='Methylation beta value\n')+
ggnewscale::new_scale_fill()+
geom_tile(aes(x=29059215,Sample,fill=log2(ID2+1),width=5))+
scale_fill_viridis_c(option = 'A')+
ggnewscale::new_scale_fill()+
geom_tile(aes(x=29059210,Sample,fill=SP_Group_New,width=5))+
scale_fill_manual(values = sp_group_color_new)+
coord_cartesian(clip = 'off')
#ggsave(filename = './output/methylation_haplotype_tmp.pdf',width = 22,height = 14,device = cairo_pdf)
# Difference among difference groups
# tumor vs normal
tmpx <- mdata %>%
filter(Type!='Control') %>%
select(Subject,Type,Pos,Beta) %>%
group_by(Pos) %>%
do(tidy(wilcox.test(Beta~Type,data=.))) %>%
ungroup() %>%
select(Pos,p.value) %>%
mutate(Type="Tumor",Beta=0)
p1 <- mdata %>%
filter(Type!='Control') %>%
select(Subject,Type,Pos,Beta) %>%
ggplot(aes(as.factor(Pos),Beta,fill=Type))+
geom_point(pch=21,size=2,position=position_jitterdodge(jitter.width = 0.1,dodge.width = 0.8),stroke=0.5,col='black')+
geom_boxplot(width=0.8,pch=21,outlier.shape = NA,alpha=0.3)+
scale_fill_d3()+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = 'Xx')+
labs(x = "Chromsome 22 coordinate", y = 'Methylation beta value')+
panel_border(color = 'black',linetype = 1)+
theme(axis.text.x = element_text(angle = 90,hjust = 0,vjust = 0.5))
p1
p2 <- tmpx %>%
ggplot(aes(as.factor(Pos),0,fill=-log10(p.value)))+
geom_tile(height=0.5)+
scale_fill_viridis_c()+
theme_minimal(base_family = 'Roboto Condensed')+
theme(axis.title = element_blank(),axis.text = element_blank(),legend.position = 'top',panel.grid = element_blank(),legend.key.width = unit(2.5,'cm'))+
labs(fill='Tumor vs. Normal, -log10(P)\n')
p2
plot_grid(p1,p2,align = 'v',axis = 'lr',rel_heights = c(6,1),ncol = 1)
#ggsave(filename = './output/methylation_haplotype_tumor_normal.pdf',width = 16,height = 7,device = cairo_pdf)
tmpx <- mdata %>%
filter(Type=='Tumor') %>%
select(Subject,ID2_Present,Pos,Beta) %>%
group_by(Pos) %>%
do(tidy(wilcox.test(Beta~ID2_Present,data=.))) %>%
ungroup() %>%
select(Pos,p.value) %>%
arrange(p.value)
p1 <- mdata %>%
filter(Type=='Tumor') %>%
select(Subject,ID2_Present,Pos,Beta) %>%
ggplot(aes(as.factor(Pos),Beta,fill=ID2_Present))+
geom_point(pch=21,size=2,position=position_jitterdodge(jitter.width = 0.1,dodge.width = 0.8),stroke=0.5,col='black')+
geom_boxplot(width=0.8,pch=21,outlier.shape = NA,alpha=0.3)+
scale_fill_nejm()+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = 'Xx')+
labs(x = "Chromsome 22 coordinate", y = 'Methylation beta value')+
panel_border(color = 'black',linetype = 1)+
theme(axis.text.x = element_text(angle = 90,hjust = 0,vjust = 0.5))
p1
p2 <- tmpx %>%
ggplot(aes(as.factor(Pos),0,fill=-log10(p.value)))+
geom_tile(height=0.5)+
scale_fill_viridis_c()+
theme_minimal(base_family = 'Roboto Condensed')+
theme(axis.title = element_blank(),axis.text = element_blank(),legend.position = 'top',panel.grid = element_blank(),legend.key.width = unit(2.5,'cm'))+
labs(fill='Tumor vs. Normal, -log10(P)\n')
p2
plot_grid(p1,p2,align = 'v',axis = 'lr',rel_heights = c(6,1),ncol = 1)
#ggsave(filename = './output/methylation_haplotype_tumor_ID2.pdf',width = 16,height = 7,device = cairo_pdf)
## boxplots for the group
my_comparisons <- list(c('Normal','Tumor (ID2 Present)'),c('Normal','Tumor (ID2 Absent)'),c('Tumor (ID2 Absent)','Tumor (ID2 Present)'))
mdata %>%
mutate(Group=case_when(
Type=='Normal' ~ 'Normal',
Type=='Tumor' & ID2_Present == 'Present' ~ 'Tumor (ID2 Present)',
Type=='Tumor' & ID2_Present == 'Absent' ~ 'Tumor (ID2 Absent)',
TRUE ~ NA
)) %>%
filter(!is.na(Group)) %>%
select(Sample,Group,Beta,CpG_ID,Pos) %>%
group_by(Group,Sample,) %>%
summarise(Beta=median(Beta,na.rm=T)) %>%
ungroup() %>%
ggplot(aes(Group,Beta,fill=Group))+
geom_point(aes(fill=Group),pch=21,size=2,position = position_jitter(width = 0.15,height = 0),color="black",stroke = 0.1)+
geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25,fill=NA)+
scale_fill_manual(values =pal_jama()(3)[c(3,1,2)])+
scale_y_continuous(breaks = pretty_breaks(n = 7))+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),axis.text.x = element_text(angle = 45,hjust = 1,vjust = 1))+
labs(x = "", y = 'Methylation beta value')+
guides(fill="none")+
panel_border(color = 'black',linetype = 1)+
stat_compare_means(comparisons = my_comparisons)
#ggsave(filename = './output/methylation_group_comparaison.pdf',width = 2.6,height = 7,device = cairo_pdf)
# linear relationship
mdata %>%
filter(Type=='Tumor', ID2>0) %>%
select(Sample,ID2,Beta,CpG_ID) %>%
group_by(Sample,ID2) %>%
summarise(Beta=median(Beta,na.rm=T)) %>%
ungroup() %>%
ggplot(aes(Beta,log2(ID2)))+
geom_point(pch=21,stroke=0.5,fill=ncicolpal[2],size=2.5)+
geom_smooth(method = 'lm')+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = 'Y',strip_text_face = 'bold',ticks = T)+
labs(x='Methylation beta value',y='Number of ID2 deletions (log2)')+
theme(panel.spacing = unit(0.2,'lines'),strip.text.x = element_text(face = 'plain',hjust = 0.5))+
coord_cartesian(clip = 'off')+
panel_border(color = 'black',linetype = 1,size=0.5)+
guides(fill="none")
#ggsave(filename = './output/methylation_ID2_correlation.pdf',width = 4.5,height = 4,device = cairo_pdf)
mdata %>%
filter(Type=='Tumor', ID2>0) %>%
select(Sample,ID2,Beta,CpG_ID) %>%
group_by(Sample,ID2) %>%
summarise(Beta=median(Beta,na.rm=T)) %>%
ungroup() %>%
do(tidy(cor.test(.$Beta,log2(.$ID2),data=.))) %>%
ungroup() %>%
arrange(p.value)
## # A tibble: 1 × 8
## estimate statistic p.value parameter conf.low conf.high method alternative
## <dbl> <dbl> <dbl> <int> <dbl> <dbl> <chr> <chr>
## 1 -0.254 -1.72 0.0923 43 -0.510 0.0428 Pearson's… two.sided
The polar plot represents the median methylation level across the genome locations of the CpG island on chr22q12.1, stratified by normal lung samples (N=80), tumor samples without ID2 signature (N=40), and tumors samples with ID2 signature (N=40). Control samples were designed to represent 0%, 33.3%, 66.6%, and 100% methylation levels at each CpG site as shown on the y-axis. c) Box plot shows DNA median methylation levels across the genome locations of the CpG island on chr22q12.1, stratified by the sample type and ID2 status.
# Polar plots
pdata <- mdata %>%
mutate(Group=case_when(
Type=='Normal' ~ 'Normal',
Type=='Tumor' & ID2_Present == 'Present' ~ 'Tumor (ID2 Present)',
Type=='Tumor' & ID2_Present == 'Absent' ~ 'Tumor (ID2 Absent)',
TRUE ~ NA
)) %>%
filter(!is.na(Group)) %>%
select(Sample,Group,Beta,CpG_ID,Pos) %>%
group_by(Group,Pos) %>%
summarise(Beta=median(Beta,na.rm=T)) %>%
ungroup() %>%
arrange(Pos) %>%
mutate(Pos=fct_inorder(as.factor(comma_format()(Pos))))
pdata %>%
ggplot(aes(x = factor(Pos), y = Beta, group = Group,color=Group)) +
ylim(0, NA)+
geom_point(stat = 'identity',size=2) +
geom_line()+
scale_y_continuous(limits = c(0.016,1),trans='log2')+
coord_polar(start = - pi * 1/24)+
theme_ipsum_rc(base_size = 14,ticks = FALSE,grid = "Xx")+
scale_color_manual(values = pal_jama()(3)[c(3,1,2)])+
theme(axis.text.y = element_blank(),axis.title.x = element_blank(),axis.title.y = element_blank(),panel.grid = element_line(linetype = 2),axis.text.x = element_text(size = 10),plot.margin = margin(4,4,4,4),legend.position = 'top')+
geom_hline(yintercept = seq(0,1,by=0.2), color = "gray90",size=0.15) +
annotate('text', x = 1, y = c(0.0243,0.217,0.475,0.971),label=c('0%','33%','66%','100%'),size=4,col='#542788')
#annotate('text', x = 1, y =seq(0,1,by=0.2),label=seq(0,1,by=0.2),size=4,col='#542788')+
#guides(color="none")
#ggsave(filename = './output/methylation_haplotype_polar.pdf',width = 5.5,height = 5.5,device = cairo_pdf)
Pearson correlation coefficients and corresponding p-values are displayed.
# Methylation data and ZNF695 expression ----------------------------------
pdata <- mdata %>%
filter(Type!='Control') %>%
select(Subject,Type,Beta,CpG_ID,Pos) %>%
group_by(Subject,Type) %>%
summarise(Beta=median(Beta,na.rm=T)) %>%
ungroup() %>%
left_join(wgs_groups_info %>% select(Subject,Tumor_Barcode)) %>%
left_join(
rdata1 %>% filter(Gene=='ZNF695') %>% select(Tumor_Barcode,Exp,Type=RNAseq_Type)
)
pdata %>%
#filter(Tumor_Barcode %in% hq_samples2) %>%
group_by(Type) %>%
do(tidy(cor.test(.$Exp, .$Beta))) %>%
arrange(p.value)
## # A tibble: 2 × 9
## # Groups: Type [2]
## Type estimate statistic p.value parameter conf.low conf.high method
## <chr> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <chr>
## 1 Tumor -0.349 -2.77 0.00773 55 -0.559 -0.0977 Pearson's prod…
## 2 Normal -0.0251 -0.163 0.872 42 -0.320 0.274 Pearson's prod…
## # ℹ 1 more variable: alternative <chr>
pdata %>%
ggplot(aes(Exp,Beta,fill=Type))+
geom_point(pch=21,size=3)+
stat_smooth(aes(fill = Type, color = Type), method = "lm") +
stat_cor(size=5,col=ncicolpal[1]) +
facet_wrap(~Type,scales = 'free')+
#stat_regline_equation(label.y = c(60,64),label.x = c(4,4)) +
scale_fill_manual(values = pal_jama()(3)[c(3,2)]) +
scale_color_manual(values = pal_jama()(3)[c(3,2)]) +
scale_y_continuous(breaks = pretty_breaks(n = 5))+
scale_x_continuous(breaks = pretty_breaks(n = 7))+
#ggh4x::facet_grid2(TL~Subtype,scale='free',independent = 'y')+
labs(x='ZNF695 RNA-Seq expression log2(CPM)',y='22q12.1 CpG probes\nmedian methylation beta value')+
theme_ipsum_rc(axis_text_size = 14,axis_title_just = 'm',axis_title_size = 16,grid = 'XY',ticks = T)+
panel_border(color = 'black')+
theme(#axis.text.x = element_blank(),
panel.spacing = unit(0.4,'cm'),
#axis.ticks.x = element_blank(),
plot.margin = margin(4,4,4,4),
legend.position = 'bottom',
legend.text = element_text(size = 14),
legend.title = element_text(size = 14),
strip.text = element_text(hjust = 0.5,face = 'bold',size = 13)
)+
guides(fill='none',col='none')
#ggsave(filename = './output/chr22_cpg_methyaltion_znf695_exp.pdf',width = 10,height =4,device = cairo_pdf)
# # Read L1EM result
# l1em <- read_delim('./data/filter_L1HS_FPM.txt',delim = '\t',col_names = T)
# l1em <- l1em %>%
# mutate(tmp=family.category.locus.strand) %>%
# separate(col = tmp,into = c('family','category','locus','strand'),sep = '\\.') %>%
# mutate(strand=str_sub(strand,1,1))
#
# # Cytoband
#
# cytoband <- read_delim('./data/cytoBand.txt.gz',delim = '\t',col_names = F)
# colnames(cytoband) <- c('chrom','start','end','cytoband','region')
# cytoband <- cytoband %>%mutate(cytoband=paste0(str_remove(chrom,'chr'),cytoband))
#
# tmp <- l1em %>%
# select(locus) %>%
# unique() %>%
# mutate(tmp=locus) %>%
# separate(col = tmp,into = c('chrom','start','end'),convert = T) %>%
# select(chrom:end,locus)
#
# tmp <- bed_intersect(tmp,cytoband) %>%
# select(locus=locus.x,cytoband=cytoband.y) %>%
# group_by(locus) %>%
# summarise(cytoband=paste0(cytoband,collapse = '-'))
#
# l1em <- l1em %>% left_join(tmp)
#
# l1em_all <- l1em %>% mutate(L1exp=only + `3prunon`, only_ratio=only/L1exp) %>% left_join(cdata)
# l1em <- l1em_all %>% filter(L1exp>2,only_ratio>0.9)
#
# save(l1em,l1em_all,file='L1EM.RData')
load('./data/L1EM.RData')
# Landscape of L1EM in tumor and normal in all available dataset-----------------------------------
library(tidyHeatmap)
library(circlize)
library(ComplexHeatmap)
tdata <- l1em %>%
group_by(Sample, cytoband,dataset,RNAseq_Type) %>%
summarise(L1exp=sum(L1exp)) %>%
mutate(L1exp=log2(L1exp)) %>%
ungroup()
cytoband_list <- tdata %>% count(cytoband,sort=T) %>% filter(n>20) %>% pull(cytoband)
tdata <- tdata %>% filter(cytoband %in% cytoband_list) %>% complete(nesting(Sample,dataset,RNAseq_Type),cytoband,fill = list(L1exp=0))
#cairo_pdf(file = './output/L1-Exp-RNAseq-all-tumor.pdf',width = 9,height = 10,family = 'Roboto Condensed')
tdata %>%
#filter(RNAseq_Type =='Tumor') %>%
filter(RNAseq_Type =='Normal') %>%
group_by(dataset) %>%
heatmap(Sample,cytoband,L1exp,
#palette_value = circlize::colorRamp2(c(seq(0,20,4),40,60),c('white',viridis::viridis(7))),
palette_value = circlize::colorRamp2(seq(0,6),c('white',viridis::viridis(6))),
rect_gp = grid::gpar(col = "#161616", lwd = 0.1),
column_names_gp = gpar(fontsize = 10),
show_row_names=FALSE,column_title='',row_title='',
heatmap_legend_param=list(at=seq(1,6),labels=2^(seq(1,6)),title= 'LINE-1 RNA (FPM)', title_position = "leftcenter-rot",legend_height=unit(6,"cm")),
palette_grouping = list(ncicolpal[c(1:2,4)])
) #annotation_tile(dataset)
#dev.off()
# Landscape of 1217 sample only ---------------------------------------------
tdata <- l1em %>%
select(-dataset,-RNAseq_Type) %>%
filter(Sample %in% cdata3$Sample) %>%
left_join(cdata3) %>%
left_join(wgs_groups_info %>% select(Tumor_Barcode,SP_Group) %>% left_join(sp_group_data2) %>% select(-SP_Group)) %>%
group_by(Sample, cytoband,SP_Group_New,RNAseq_Type) %>%
summarise(L1exp=sum(L1exp)) %>%
mutate(L1exp=log2(L1exp)) %>%
ungroup()
cytoband_list <- tdata %>% count(cytoband,sort=T) %>% filter(n>15) %>% pull(cytoband)
tdata <- tdata %>% filter(cytoband %in% cytoband_list) %>% complete(nesting(Sample,SP_Group_New,RNAseq_Type),cytoband,fill = list(L1exp=0))
tmpsel <- cdata3 %>% filter(Tumor_Barcode %in% hq_samples2) %>% pull(Sample)
library(tidyHeatmap)
library(grid)
#cairo_pdf(file = 'L1-Exp-RNAseq-hq-luad-tumor.pdf',width = 5,height = 8,family = 'Roboto Condensed')
#cairo_pdf(file = './output/L1-Exp-RNAseq-hq-luad-normal.pdf',width = 5,height = 6,family = 'Roboto Condensed')
tdata %>%
#filter(RNAseq_Type =='Tumor') %>%
filter(RNAseq_Type =='Normal') %>%
filter(Sample %in% tmpsel) %>%
group_by(SP_Group_New) %>%
heatmap(Sample,cytoband,L1exp,
#palette_value = circlize::colorRamp2(c(seq(0,20,4),40,60),c('white',viridis::viridis(7))),
palette_value = circlize::colorRamp2(seq(0,6),c('white',viridis::viridis(6))),
rect_gp = grid::gpar(col = "#161616", lwd = 0.1),
column_names_gp = gpar(fontsize = 10),
show_row_names=FALSE,column_title='',row_title='',
heatmap_legend_param=list(at=seq(1,6),labels=2^(seq(1,6)),title= 'LINE-1 RNA (FPM)', title_position = "leftcenter-rot",legend_height=unit(6,"cm")),
palette_grouping = list(sp_group_color)
) #annotation_tile(dataset)
#dev.off()
## overall
tdata <- l1em %>%
select(-dataset,-RNAseq_Type) %>%
filter(Sample %in% cdata3$Sample) %>%
left_join(cdata3) %>%
left_join(wgs_groups_info %>% select(Tumor_Barcode,SP_Group) %>% left_join(sp_group_data2) %>% select(-SP_Group)) %>%
filter(Tumor_Barcode %in% hq_samples2) %>%
group_by(Sample,Tumor_Barcode,SP_Group_New,RNAseq_Type) %>%
summarise(L1exp=sum(L1exp)) %>%
#mutate(L1exp=log2(L1exp)) %>%
ungroup()
tdata %>%
filter(SP_Group_New != 'Others') %>%
ggplot(aes(SP_Group_New,log2(L1exp),fill=RNAseq_Type))+
geom_boxplot(width=0.6,pch=21,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=1.5,position=position_jitterdodge(jitter.width = 0.1,dodge.width = 0.5),color="black")+
scale_fill_manual(values = ncicolpal[c(20,1)])+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
labs(x = "", y = 'LINE-1 RNA (FPM)',fill='Sample Type')+
panel_border(color = 'black',linetype = 1)
#ggsave(filename = './output/L1-Exp-RNAseq-hq-luad-overall.pdf',width = 5,height = 5,device = cairo_pdf)
tdata %>% group_by(SP_Group_New) %>% do(tidy(wilcox.test(L1exp~RNAseq_Type,data=.))) %>% ungroup() %>% filter(SP_Group_New != 'Others') %>% select(SP_Group_New,p.value) %>% arrange(p.value)
## # A tibble: 3 × 2
## SP_Group_New p.value
## <chr> <dbl>
## 1 AS_N 0.00110
## 2 EU_S 0.0162
## 3 EU_N 0.0266
tdata %>% filter(SP_Group_New %in% c('EU_S','AS_N')) %>% group_by(RNAseq_Type) %>% do(tidy(wilcox.test(L1exp~SP_Group_New,data=.))) %>% ungroup() %>% select(RNAseq_Type,p.value) %>% arrange(p.value)
## # A tibble: 2 × 2
## RNAseq_Type p.value
## <chr> <dbl>
## 1 Tumor 0.00000000799
## 2 Normal 0.0000000761
tdata %>% filter(SP_Group_New %in% c('EU_S','EU_N')) %>% group_by(RNAseq_Type) %>% do(tidy(wilcox.test(L1exp~SP_Group_New,data=.))) %>% ungroup() %>% select(RNAseq_Type,p.value) %>% arrange(p.value)
## # A tibble: 2 × 2
## RNAseq_Type p.value
## <chr> <dbl>
## 1 Normal 0.0000382
## 2 Tumor 0.000224
tdata %>% mutate(SP_Group_New=if_else(SP_Group_New == 'AS_N','EU_N',SP_Group_New)) %>% filter(SP_Group_New %in% c('EU_S','EU_N')) %>% group_by(RNAseq_Type) %>% do(tidy(wilcox.test(L1exp~SP_Group_New,data=.))) %>% ungroup() %>% select(RNAseq_Type,p.value) %>% arrange(p.value)
## # A tibble: 2 × 2
## RNAseq_Type p.value
## <chr> <dbl>
## 1 Normal 0.0000000112
## 2 Tumor 0.0000000214
load('./data/sdata.RData')
my_comparisons <- list(c("Smoker", "Non-Smoker"))
tdata %>%
left_join(covdata0) %>%
left_join(
sdata %>% select(Tumor_Barcode,CIGT_CURRENT_STATUS) %>% mutate(CIGT_CURRENT_STATUS = if_else(CIGT_CURRENT_STATUS ==1,'Current Smoker','Former Smoker'))
) %>%
filter(Smoking != 'Unknown') %>%
ggplot(aes(Smoking,log2(L1exp),fill=Smoking))+
geom_boxplot(width=0.6,pch=21,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=1.5,position=position_jitterdodge(jitter.width = 0.5,dodge.width = 0.5),color="black")+
#scale_fill_manual(values = ncicolpal[c(20,1)])+
facet_wrap(~RNAseq_Type)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
labs(x = "", y = 'LINE-1 RNA (FPM)',fill='Sample Type')+
panel_border(color = 'black',linetype = 1)+
stat_compare_means(comparisons = my_comparisons)
#ggsave(filename = './output/L1-Exp-RNAseq-hq-luad-overall_smoking.pdf',width = 6.5,height = 5,device = cairo_pdf)
my_comparisons <- list(c("Current Smoker", "Non-Smoker"),c("Former Smoker", "Non-Smoker"),c("Current Smoker", "Former Smoker"))
tdata %>%
left_join(covdata0) %>%
left_join(
sdata %>% select(Tumor_Barcode,CIGT_CURRENT_STATUS)
) %>%
mutate(CIGT_CURRENT_STATUS = if_else(is.na(CIGT_CURRENT_STATUS),Smoking,if_else(CIGT_CURRENT_STATUS ==1,'Current Smoker','Former Smoker'))) %>%
filter(Smoking != 'Unknown',CIGT_CURRENT_STATUS != 'Smoker') %>%
ggplot(aes(CIGT_CURRENT_STATUS,log2(L1exp),fill=CIGT_CURRENT_STATUS))+
geom_boxplot(width=0.6,pch=21,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=1.5,position=position_jitterdodge(jitter.width = 0.3,dodge.width = 0.5),color="black")+
#scale_fill_manual(values = ncicolpal[c(20,1)])+
facet_wrap(~RNAseq_Type)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
labs(x = "", y = 'LINE-1 RNA (FPM)',fill='Sample type')+
panel_border(color = 'black',linetype = 1)+
theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1))+
stat_compare_means(comparisons = my_comparisons)
#ggsave(filename = './output/L1-Exp-RNAseq-hq-luad-overall_smoking2.pdf',width = 8,height = 6,device = cairo_pdf)
# ID2
tdata <- l1em %>%
select(-dataset,-RNAseq_Type) %>%
filter(Sample %in% cdata3$Sample) %>%
left_join(cdata3) %>%
left_join(wgs_groups_info %>% select(Tumor_Barcode,SP_Group) %>% left_join(sp_group_data) %>% select(-SP_Group)) %>%
group_by(Tumor_Barcode,SP_Group_New,RNAseq_Type) %>%
summarise(L1exp=sum(L1exp)) %>%
#mutate(L1exp=log2(L1exp)) %>%
ungroup() %>%
filter(Tumor_Barcode %in% hq_samples2)
tdata <- tdata %>% left_join(id2data)
tdata %>% filter(SP_Group_New != "Others") %>% group_by(SP_Group_New) %>% do(tidy(wilcox.test(L1exp~ID2_Present,data=.))) %>% ungroup() %>% select(SP_Group_New,p.value)
## # A tibble: 3 × 2
## SP_Group_New p.value
## <chr> <dbl>
## 1 AS_N 0.837
## 2 EU_N 0.967
## 3 EU_S 0.841
#%>% select(RNAseq_Type,p.value) %>% arrange(p.value) %>% write_clip()
tdata %>%
filter(SP_Group_New!='Others') %>%
ggplot(aes(SP_Group_New,log2(L1exp),fill=ID2_Present))+
geom_boxplot(width=0.6,pch=21,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=1.5,position=position_jitterdodge(jitter.width = 0.1,dodge.width = 0.5),color="black")+
scale_fill_manual(values = ncicolpal[c(2:1)])+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
labs(x = "", y = 'LINE-1 RNA (FPM)',fill='ID2 Signature')+
panel_border(color = 'black',linetype = 1)
#ggsave(filename = './output/ID2-L1-Exp-RNAseq.pdf',width = 5,height = 6,device = cairo_pdf)
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 15.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ComplexHeatmap_2.18.0 circlize_0.4.16 tidyHeatmap_1.11.6
## [4] dunnr_0.2.6 valr_0.8.4 conflicted_1.2.0
## [7] hablar_0.3.2 ggpubr_0.6.1 ggsci_3.2.0
## [10] broom_1.0.8 ggpmisc_0.6.2 ggpp_0.5.9
## [13] ggasym_0.1.6 data.table_1.17.8 ggnewscale_0.5.2
## [16] ggrepel_0.9.6 cowplot_1.2.0 hrbrthemes_0.8.7
## [19] scales_1.4.0 ggsankey_0.0.99999 lubridate_1.9.4
## [22] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
## [25] purrr_1.0.4 readr_2.1.5 tidyr_1.3.1
## [28] tibble_3.3.0 ggplot2_3.5.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] polynom_1.4-1 gridExtra_2.3 readxl_1.4.5
## [4] rlang_1.1.6 magrittr_2.0.3 clue_0.3-66
## [7] GetoptLong_1.0.5 matrixStats_1.5.0 compiler_4.3.2
## [10] mgcv_1.9-1 png_0.1-8 systemfonts_1.2.3
## [13] vctrs_0.6.5 quantreg_6.1 shape_1.4.6.1
## [16] crayon_1.5.3 pkgconfig_2.0.3 fastmap_1.2.0
## [19] magick_2.8.7 backports_1.5.0 labeling_0.4.3
## [22] utf8_1.2.6 rmarkdown_2.29 tzdb_0.5.0
## [25] MatrixModels_0.5-4 xfun_0.52 cachem_1.1.0
## [28] jsonlite_2.0.0 cluster_2.1.8.1 parallel_4.3.2
## [31] R6_2.6.1 bslib_0.9.0 stringi_1.8.7
## [34] RColorBrewer_1.1-3 car_3.1-3 extrafontdb_1.0
## [37] jquerylib_0.1.4 cellranger_1.1.0 Rcpp_1.1.0
## [40] iterators_1.0.14 knitr_1.50 usethis_3.1.0
## [43] IRanges_2.36.0 extrafont_0.19 Matrix_1.6-5
## [46] splines_4.3.2 timechange_0.3.0 tidyselect_1.2.1
## [49] viridis_0.6.5 rstudioapi_0.17.1 dichromat_2.0-0.1
## [52] abind_1.4-8 yaml_2.3.10 codetools_0.2-20
## [55] doParallel_1.0.17 lattice_0.22-7 withr_3.0.2
## [58] evaluate_1.0.4 survival_3.8-3 pillar_1.11.0
## [61] carData_3.0-5 stats4_4.3.2 foreach_1.5.2
## [64] generics_0.1.4 S4Vectors_0.40.2 hms_1.1.3
## [67] glue_1.8.0 gdtools_0.4.2 tools_4.3.2
## [70] dendextend_1.19.0 SparseM_1.84-2 ggsignif_0.6.4
## [73] distill_1.6 fs_1.6.6 Cairo_1.6-2
## [76] Rttf2pt1_1.3.12 colorspace_2.1-1 patchwork_1.3.1
## [79] nlme_3.1-168 Formula_1.2-5 cli_3.6.5
## [82] fontBitstreamVera_0.1.1 viridisLite_0.4.2 downlit_0.4.4
## [85] gtable_0.3.6 rstatix_0.7.2 sass_0.4.10
## [88] digest_0.6.37 fontquiver_0.2.1 BiocGenerics_0.48.1
## [91] rjson_0.2.23 farver_2.1.2 memoise_2.0.1
## [94] htmltools_0.5.8.1 lifecycle_1.0.4 GlobalOptions_0.1.2
## [97] fontLiberation_0.1.0 MASS_7.3-60.0.1